Homework_4_HS650_Fall_2021.Rmd.
library(rvest)
library(tm)
library(wordcloud)
library(plotly)
library(e1071)
library(gmodels)
library(C50)
library(caret)
library(rpart)
library(rpart.utils)urlSOCR <- read_html("http://wiki.socr.umich.edu/index.php/SOCR_Data_2011_US_JobsRanking#2011_Ranking_of_the_200_most_common_Jobs_in_the_US")
socrDF <- html_table(html_nodes(urlSOCR, "table")[[1]])
jsdDF <- socrDF[-1]
jsdDF$Description <- gsub('_', ' ', jsdDF$Description)
#jsdDF$Job_Title <- gsub('_', ' ', jsdDF$Job_Title)
plot_ly(jsdDF, x = ~Stress_Category, y = ~Hiring_Potential, type="box") %>%
layout(title = "Hiring Potential Given Stress Category", xaxis = list(title = "Stress Category"), yaxis = list(title = "Hiring Potential"))plot_ly(jsdDF, x = ~Hiring_Potential, type="histogram") %>%
layout(title = "Hiring Potential Distribution", xaxis = list(title = "Hiring Potential"), bargap=0.1)plot_ly(jsdDF, x = ~Stress_Category, type="histogram") %>%
layout(title = "Stress Category Distribution", xaxis = list(title = "Stress Category"), bargap=0.1)plot_ly(jsdDF, x = ~Stress_Category, y = ~`Average_Income(USD)`, type="box") %>%
layout(title = "Average Income (USD) Given Stress Category", xaxis = list(title = "Stress Category"), yaxis = list(title = "Average Income (USD)"))plot_ly(jsdDF, x = ~Physical_Demand, y = ~`Average_Income(USD)`, type="scatter") %>%
# add_lines(x = ~Physical_Demand, y= ~lm( Physical_Demand ~ `Average_Income(USD)`)) %>%
layout(title = "Average Income (USD) Given Physical Demand", xaxis = list(title = "Physical Demand"), yaxis = list(title = "Average Income (USD)"))corpusJSD <- Corpus(VectorSource(jsdDF$Description))
corpusJSD <- tm_map(corpusJSD, tolower)
corpusJSD <- tm_map(corpusJSD, removePunctuation)
corpusJSD <- tm_map(corpusJSD, stripWhitespace)
corpusJSD <- tm_map(corpusJSD, removeNumbers)
corpusJSD <- tm_map(corpusJSD, removeWords, stopwords('english'))
jsdDTM <- DocumentTermMatrix(corpusJSD)
set.seed(12345)
subsetInterval <- sample(nrow(jsdDF), floor(nrow(jsdDF) * 0.9)) # 90% training + 10% testing
jsdDFTrain <- jsdDF[subsetInterval, ]
jsdDFTest <- jsdDF[-subsetInterval, ]
jsdDTMTrain <- jsdDTM[subsetInterval, ]
jsdCorpusTrain <- corpusJSD[subsetInterval]
jsdCorpusTest <- corpusJSD[-subsetInterval]
#See Distribution of Stress Categories between Training vs Testing datasets
prop.table(table(jsdDFTrain$Stress_Category))##
## 0 1 2 3 4 5
## 0.06666667 0.43333333 0.32222222 0.11666667 0.05000000 0.01111111
prop.table(table(jsdDFTest$Stress_Category))##
## 1 2 3 4
## 0.35 0.30 0.15 0.20
#Binari"ize" Stress label for training and testing datasets
jsdDFTrain$Stress_High <- jsdDFTrain$Stress_Category %in% c(3:5)
jsdDFTrain$Stress_High <- factor(jsdDFTrain$Stress_High, levels = c(F, T), labels = c("Low_Stress", "High_Stress"))
jsdDFTest$Stress_High <- jsdDFTest$Stress_Category %in% c(3:5)
jsdDFTest$Stress_High <- factor(jsdDFTest$Stress_High, levels = c(F, T), labels = c("Low_Stress", "High_Stress"))
prop.table(table(jsdDFTrain$Stress_High))##
## Low_Stress High_Stress
## 0.8222222 0.1777778
prop.table(table(jsdDFTest$Stress_High))##
## Low_Stress High_Stress
## 0.65 0.35
# separate out low vs high stress from training data for graphics
jsdDFTrainLow <- subset(jsdDFTrain, Stress_High=="Low_Stress")
jsdDFTrainHigh <- subset(jsdDFTrain, Stress_High=="High_Stress")
jsdCorpusTrainLow <- Corpus(VectorSource(jsdDFTrainLow$Description))
jsdCorpusTrainLow <- tm_map(jsdCorpusTrainLow, tolower)
jsdCorpusTrainLow <- tm_map(jsdCorpusTrainLow, removePunctuation)
jsdCorpusTrainLow <- tm_map(jsdCorpusTrainLow, removeNumbers)
jsdCorpusTrainLow <- tm_map(jsdCorpusTrainLow, stripWhitespace)
jsdCorpusTrainLow <- tm_map(jsdCorpusTrainLow, removeWords, stopwords('english'))
jsdCorpusTrainHigh <- Corpus(VectorSource(jsdDFTrainHigh$Description))
jsdCorpusTrainHigh <- tm_map(jsdCorpusTrainHigh, tolower)
jsdCorpusTrainHigh <- tm_map(jsdCorpusTrainHigh, removeNumbers)
jsdCorpusTrainHigh <- tm_map(jsdCorpusTrainHigh, removePunctuation)
jsdCorpusTrainHigh <- tm_map(jsdCorpusTrainHigh, stripWhitespace)
jsdCorpusTrainHigh <- tm_map(jsdCorpusTrainHigh, removeWords, stopwords('english'))
jsdDTMTrainLow <- DocumentTermMatrix(jsdCorpusTrainLow)
jsdDTMTrainLow <- removeSparseTerms(jsdDTMTrainLow, 0.98)
jsdDTMTrainHigh <- DocumentTermMatrix(jsdCorpusTrainHigh)
jsdDTMTrainHigh <- removeSparseTerms(jsdDTMTrainHigh, 0.98)
# Low Stress Jobs
wordcloud(jsdCorpusTrainLow, min.freq = 7, random.order = FALSE, colors = brewer.pal(4, "Dark2"))wordSumLow <- colSums(as.matrix(jsdDTMTrainLow))
plot_ly(x = ~unname(wordSumLow), y = ~names(wordSumLow), type="bar", orientation="h") %>%
layout(title="Common Word Descriptions Low Stress Jobs", xaxis = list(title = "Words"), yaxis = list(title = "Counts"), bargap="0.1")# High Stress Jobs
wordcloud(jsdCorpusTrainHigh, min.freq = 3, random.order = FALSE, colors = brewer.pal(4, "Dark2"))wordSumHigh <- colSums(as.matrix(jsdDTMTrainHigh))
plot_ly(x = ~unname(wordSumHigh), y = ~names(wordSumHigh), type="bar", orientation="h") %>%
layout(title="Common Word Descriptions High Stress Jobs", xaxis = list(title = "Words"), yaxis = list(title = "Counts"), bargap="0.1")# Create Function to binary"ize" Yes/No whether word appears > 5 times in the DTM and Apply
convertToCounts <- function(wordFrequency) {
wordFrequency <- ifelse(wordFrequency > 0, 1, 0)
wordFrequency < factor(wordFrequency, levels = c(0, 1), labels = c("No", "Yes"))
return(wordFrequency)
}
#create training dictionary object & respective DTMs
jsdDict <- as.character(findFreqTerms(jsdDTMTrain, 5))
jsdDTMTrain <- DocumentTermMatrix(jsdCorpusTrain, list(dictionary = jsdDict))
jsdDTMTest <- DocumentTermMatrix(jsdCorpusTest, list(dictionary = jsdDict))
jsdTrain <- apply(jsdDTMTrain, MARGIN = 2, convertToCounts)
jsdTest <- apply(jsdDTMTest, MARGIN = 2, convertToCounts)
jsdBayesClassifier <- naiveBayes(jsdTrain, jsdDFTrain$Stress_High, laplace = 0)
jsdBayesPredict <- predict(jsdBayesClassifier, jsdTest, type="raw")
head(jsdBayesPredict)## Low_Stress High_Stress
## [1,] 1.380806e-48 1
## [2,] 5.474669e-37 1
## [3,] 4.923113e-36 1
## [4,] 8.311141e-48 1
## [5,] 3.161521e-38 1
## [6,] 6.766165e-37 1
# crossTable <- CrossTable(jsdBayesPredict, jsdDFTest$Stress_High)
# crossTablejsdTree50 <- C5.0(jsdTrain, jsdDFTrain$Stress_High)
jsdTree50##
## Call:
## C5.0.default(x = jsdTrain, y = jsdDFTrain$Stress_High)
##
## Classification Tree
## Number of samples: 180
## Number of predictors: 58
##
## Tree size: 1
##
## Non-standard options: attempt to group attributes
summary(jsdTree50)##
## Call:
## C5.0.default(x = jsdTrain, y = jsdDFTrain$Stress_High)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Oct 29 14:21:16 2021
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 180 cases (59 attributes) from undefined.data
##
## Decision tree:
## Low_Stress (180/32)
##
##
## Evaluation on training data (180 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 1 32(17.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 148 (a): class Low_Stress
## 32 (b): class High_Stress
##
##
## Time: 0.0 secs
jsdTree50Predict <- predict(jsdTree50, jsdTest)
confusionMatrix(table(jsdTree50Predict, jsdDFTest$Stress_High))## Confusion Matrix and Statistics
##
##
## jsdTree50Predict Low_Stress High_Stress
## Low_Stress 13 7
## High_Stress 0 0
##
## Accuracy : 0.65
## 95% CI : (0.4078, 0.8461)
## No Information Rate : 0.65
## P-Value [Acc > NIR] : 0.60103
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 1.00
## Specificity : 0.00
## Pos Pred Value : 0.65
## Neg Pred Value : NaN
## Prevalence : 0.65
## Detection Rate : 0.65
## Detection Prevalence : 1.00
## Balanced Accuracy : 0.50
##
## 'Positive' Class : Low_Stress
##
plot(jsdTree50, subtree = 1)fit <- lm(formula = Overall_Score ~ `Average_Income(USD)` + Work_Environment + Stress_Level + Stress_Category + Physical_Demand + Hiring_Potential, jsdDF)
summary(fit)##
## Call:
## lm(formula = Overall_Score ~ `Average_Income(USD)` + Work_Environment +
## Stress_Level + Stress_Category + Physical_Demand + Hiring_Potential,
## data = jsdDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -447.49 -32.61 3.68 34.93 232.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 304.015800 16.578635 18.338 < 2e-16 ***
## `Average_Income(USD)` -0.001157 0.000142 -8.150 4.48e-14 ***
## Work_Environment 0.167649 0.018709 8.961 2.76e-16 ***
## Stress_Level 2.555219 1.608503 1.589 0.114
## Stress_Category 7.975302 15.367294 0.519 0.604
## Physical_Demand 6.555979 0.817788 8.017 1.02e-13 ***
## Hiring_Potential -5.870516 0.402249 -14.594 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.68 on 193 degrees of freedom
## Multiple R-squared: 0.8904, Adjusted R-squared: 0.887
## F-statistic: 261.3 on 6 and 193 DF, p-value: < 2.2e-16
plot_ly(x=fit$fitted.values, y=fit$residuals, type="scatter", mode="markers") %>%
layout(title="LM: Fitted-values vs. Model-Residuals",
xaxis=list(title="Fitted"),
yaxis = list(title="Residuals"))step(fit,direction = "backward")## Start: AIC=1662.1
## Overall_Score ~ `Average_Income(USD)` + Work_Environment + Stress_Level +
## Stress_Category + Physical_Demand + Hiring_Potential
##
## Df Sum of Sq RSS AIC
## - Stress_Category 1 1058 759350 1660.4
## <none> 758292 1662.1
## - Stress_Level 1 9915 768207 1662.7
## - Physical_Demand 1 252507 1010798 1717.6
## - `Average_Income(USD)` 1 260999 1019291 1719.3
## - Work_Environment 1 315488 1073780 1729.7
## - Hiring_Potential 1 836837 1595129 1808.8
##
## Step: AIC=1660.38
## Overall_Score ~ `Average_Income(USD)` + Work_Environment + Stress_Level +
## Physical_Demand + Hiring_Potential
##
## Df Sum of Sq RSS AIC
## <none> 759350 1660.4
## - Stress_Level 1 97981 857331 1682.7
## - Physical_Demand 1 255719 1015069 1716.4
## - `Average_Income(USD)` 1 260039 1019389 1717.3
## - Work_Environment 1 316119 1075469 1728.0
## - Hiring_Potential 1 835814 1595164 1806.8
##
## Call:
## lm(formula = Overall_Score ~ `Average_Income(USD)` + Work_Environment +
## Stress_Level + Physical_Demand + Hiring_Potential, data = jsdDF)
##
## Coefficients:
## (Intercept) `Average_Income(USD)` Work_Environment Stress_Level Physical_Demand Hiring_Potential
## 299.952496 -0.001151 0.167797 3.315567 6.583580 -5.861739